Visualize the empirical distribution function of count in each individual. We are interested to learn distribution of UMI counts without log transformation. When we visualize density distributions on a log scale, there appear to be multiple modes in the distribution. The mode with the lowest expression value corresponds to cells with zero UMI count.
For simplicity, here I selected a single replicate from each individaul cell line and compare count distributions across individuals.
We didn’t observe bimodality of gene expression of the 16 pluripotent genes in our data. This could be due to the fact that our iPSC cell lines are controlled in an undifferentiated and homogeneous state. In the case when differentiation is expected in the cells, we should see a pattern of bimodality in expression distribution.
Density plot is not a useful tool in visuazling count distribution. They create a visual illusion of multiple modes in the distribution when there are big increase or decrease in frequency of the counts.
Replicates are consisent in count distributions in each individual cell line.
library(Humanzee)
Import list of pluripotent genes.
pluripotent_genes <- read.table("../data/gilad-2015/pluripotency-genes.txt",
header = TRUE,
stringsAsFactors = FALSE, sep = "\t")
head(pluripotent_genes)
## From To Species
## 1 GDF3 ENSG00000184344 Homo sapiens
## 2 ZFP42 ENSG00000179059 Homo sapiens
## 3 NR6A1 ENSG00000148200 Homo sapiens
## 4 SOX2 ENSG00000181449 Homo sapiens
## 5 CDKN2A ENSG00000147889 Homo sapiens
## 6 POU5F1 ENSG00000204531 Homo sapiens
## Gene.Name
## 1 growth differentiation factor 3
## 2 zinc finger protein 42 homolog (mouse)
## 3 nuclear receptor subfamily 6, group A, member 1
## 4 SRY (sex determining region Y)-box 2
## 5 cyclin-dependent kinase inhibitor 2A (melanoma, p16, inhibits CDK4)
## 6 POU class 5 homeobox 1
Import filtered molecule counts
molecules_filter <- read.table("../data/gilad-2015/molecules-filter.txt",
header = TRUE,
stringsAsFactors = FALSE)
which_pluripotent <- which(rownames(molecules_filter) %in% pluripotent_genes$To)
molecules_pluripotent <- molecules_filter[which_pluripotent, ]
Subset pluripotent genes that are mapped in our data.
pluripotent_subset <- pluripotent_genes[
which(pluripotent_genes$To %in% rownames(molecules_filter)), ]
Import annotations of the filtered data
anno_filter <- read.table("../data/gilad-2015/annotation-filter.txt",
header = TRUE,
stringsAsFactors = FALSE)
dim(anno_filter)
## [1] 560 5
Plot for a single replicate and all three individuals.
anno_filter_r3 <- anno_filter[ which(anno_filter$replicate == "r3"), ]
gene_counts_r3 <- molecules_pluripotent[ , anno_filter_r3$replicate == "r3"]
ngenes <- dim(gene_counts_r3)[1]
par(mfrow = c(1,3))
for (which_gene_view in c(1:ngenes)) {
cols <- c("red", "green", "blue")
individuals <- unique(anno_filter$individual)
## ECDF of all three individuals
plot( ecdf( unlist(
gene_counts_r3[ which_gene_view,
anno_filter_r3$individual == individuals[1] ] ) ),
main = "", col = cols[1] )
for (ii_individual in c(2:length(individuals))) {
lines( ecdf( unlist( gene_counts_r3[ which_gene_view,
anno_filter_r3$individual == individuals[ii_individual] ]) ),
col = cols[ii_individual] )
}
## Density plot of all three individuals
plot_density_overlay(gene_counts_r3,
annotation = anno_filter_r3,
which_gene = rownames(gene_counts_r3)[which_gene_view],
gene_symbols = NULL,
labels = "")
## Simple bar plot of counts
count_lims <- table( unlist( gene_counts_r3[ which_gene_view, ] ) )
ylims <- c(0, max(count_lims))
xlims <- range(as.numeric(names(count_lims)))
plot( table( unlist(
gene_counts_r3[ which_gene_view,
anno_filter_r3$individual == individuals[1] ] ) ),
type = "h", xlab = "Count", ylab = "Frequency",
main = "", col = alpha( cols[1], .5),
xlim = xlims, ylim = ylims, axes = F )
axis(1); axis(2)
for (ii_individual in c(2:length(individuals))) {
points( table( unlist(
gene_counts_r3[ which_gene_view,
anno_filter_r3$individual == individuals[ii_individual] ] ) ),
type = "h",
col = alpha( cols[ii_individual], .5) )
}
}
## Warning: package 'broman' was built under R version 3.2.3
Plot for a single replicate and all three individuals.
anno_filter_r3 <- anno_filter[ which(anno_filter$replicate == "r3"), ]
gene_counts_r3 <- molecules_pluripotent[ , anno_filter_r3$replicate == "r3"]
ngenes <- dim(gene_counts_r3)[1]
par(mfrow = c(3,3))
for (which_gene_view in c(1:ngenes)) {
cols <- c("red", "green", "blue")
individuals <- unique(anno_filter$individual)
replicates <- unique(anno_filter$replicate)[order(unique(anno_filter$replicate))]
## Simple bar plot of counts
count_lims <- lapply(1:length(individuals), function(which_individual) {
ind_lims <- lapply(1:length(replicates), function(which_replicate) {
replicate_lims <- unlist( table( as.matrix(
molecules_pluripotent[ which_gene_view,
anno_filter$individual == individuals[which_individual] &
anno_filter$replicate == replicates[which_replicate] ] ) ) )
if (length(replicate_lims) != 0) {
list(ylims = max(replicate_lims),
xlims = range(as.numeric(names(replicate_lims)) ) )
} else {
list(ylims = 0,
xlims = 0)
}
})
list(ylims = sapply(ind_lims, "[[", 1),
xlims = sapply(ind_lims, "[[", 2))
})
ylims <- c(0, max( unlist(sapply(count_lims, "[[", 1)) ) )
xlims <- range( unlist(sapply(count_lims, "[[", 2)) )
for (ii_individual in 1:length(individuals)) {
df_ind <- molecules_pluripotent[ , anno_filter$individual == individuals[ii_individual]]
anno_ind <- anno_filter[anno_filter$individual == individuals[ii_individual], ]
for (ii_replicate in 1:length(replicates)) {
if (sum(anno_ind$replicate == replicates[ii_replicate]) != 0) {
plot( table( unlist(
df_ind[ which_gene_view, anno_ind$replicate == replicates[ii_replicate]] ) ),
type = "h", xlab = "Count", ylab = "Frequency",
main = "", col = alpha( cols[ii_individual], 1),
xlim = xlims, ylim = ylims)
} else {
plot(0, pch = "", axes = F, xlab = "", ylab = "")
}
title(paste(individuals[ii_individual],
replicates[ii_replicate], sep = "."))
}
}
title(rownames(molecules_pluripotent)[which_gene_view],
outer = TRUE,
line = -1)
}
library(knitr)
## Warning: package 'knitr' was built under R version 3.2.3
kable(pluripotent_subset)
| From | To | Species | Gene.Name | |
|---|---|---|---|---|
| 1 | GDF3 | ENSG00000184344 | Homo sapiens | growth differentiation factor 3 |
| 2 | ZFP42 | ENSG00000179059 | Homo sapiens | zinc finger protein 42 homolog (mouse) |
| 3 | NR6A1 | ENSG00000148200 | Homo sapiens | nuclear receptor subfamily 6, group A, member 1 |
| 4 | SOX2 | ENSG00000181449 | Homo sapiens | SRY (sex determining region Y)-box 2 |
| 6 | POU5F1 | ENSG00000204531 | Homo sapiens | POU class 5 homeobox 1 |
| 10 | MYC | ENSG00000136997 | Homo sapiens | v-myc myelocytomatosis viral oncogene homolog (avian) |
| 11 | DNMT3B | ENSG00000088305 | Homo sapiens | DNA (cytosine-5-)-methyltransferase 3 beta |
| 12 | TERT | ENSG00000164362 | Homo sapiens | telomerase reverse transcriptase |
| 14 | FOXD3 | ENSG00000187140 | Homo sapiens | forkhead box D3 |
| 16 | DPPA5 | ENSG00000203909 | Homo sapiens | developmental pluripotency associated 5 |
| 17 | NANOG | ENSG00000111704 | Homo sapiens | Nanog homeobox pseudogene 8; Nanog homeobox |
| 19 | PODXL | ENSG00000128567 | Homo sapiens | podocalyxin-like |
| 21 | DPPA4 | ENSG00000121570 | Homo sapiens | developmental pluripotency associated 4 |
| 23 | DPPA2 | ENSG00000163530 | Homo sapiens | developmental pluripotency associated 2 |
| 24 | CDKN1A | ENSG00000124762 | Homo sapiens | cyclin-dependent kinase inhibitor 1A (p21, Cip1) |
| 25 | SALL4 | ENSG00000101115 | Homo sapiens | sal-like 4 (Drosophila) |
sessionInfo()
## R version 3.2.1 (2015-06-18)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: OS X 10.10.5 (Yosemite)
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] knitr_1.12.3 broman_0.62-1 scales_0.3.0 Humanzee_0.1.0
##
## loaded via a namespace (and not attached):
## [1] Rcpp_0.12.3 assertthat_0.1 digest_0.6.9 plyr_1.8.3
## [5] formatR_1.2.1 magrittr_1.5 evaluate_0.8 highr_0.5.1
## [9] stringi_1.0-1 rmarkdown_0.9.2 tools_3.2.1 stringr_1.0.0
## [13] munsell_0.4.3 yaml_2.1.13 colorspace_1.2-6 htmltools_0.3